# Module 2 Demo — Data Manipulation
# Based on R4DS Chapters 18 (Pipes), 5 (Data Transformation), 10 (Tibbles), 11 (Data Import)
# Hands-on practice with clinical data manipulation

# Load required libraries
library(tidyverse)    # Includes dplyr, tibble, readr and %>% operator
library(lubridate)    # Date manipulation
library(haven)        # SAS/XPT file import
library(readxl)       # Excel file import

# ===============================
# Part 1: Data Import (R4DS Ch. 11)
# ===============================

# In clinical studies, data comes from various sources
# Let's simulate importing different file types

cat("=== DATA IMPORT DEMONSTRATION ===\n")

# Stand-in for data that would normally arrive from an EDC export, an Excel
# workbook, or a SAS dataset. The table is laid out row-wise with tribble()
# so each subject reads as one record; the result is an ordinary tibble.
dm <- tribble(
  ~USUBJID,  ~AGE, ~SEX, ~RFSTDTC,     ~COUNTRY, ~RACE,
  "001-001", 25,   "F",  "2024-01-15", "USA",    "WHITE",
  "001-002", 45,   "M",  "2024-01-16", "USA",    "BLACK OR AFRICAN AMERICAN",
  "001-003", 67,   "F",  "2024-01-17", "CAN",    "WHITE",
  "001-004", 52,   "M",  "2024-01-18", "USA",    "ASIAN",
  "001-005", 71,   "F",  "2024-01-19", "CAN",    "WHITE",
  "001-006", 34,   "M",  "2024-01-20", "USA",    "WHITE"
)

cat("Demographics dataset created as tibble:\n")
# A tibble prints its dimensions and column types along with the rows
# (R4DS Ch. 10)
print(dm)

cat("\nDataset structure (glimpse shows types and dimensions):\n")
# glimpse() lists one column per line, which suits wide datasets
glimpse(dm)

# ===============================
# Part 2: Pipe Workflow (R4DS Ch. 18)
# ===============================

cat("\n=== PIPE OPERATOR DEMONSTRATION ===\n")

# The same query written as nested calls has to be read inside-out:
#   arrange(select(filter(dm, AGE >= 65), USUBJID, AGE), AGE)
#
# With %>% each step hands its result to the next, top to bottom.
elderly_summary <- dm %>%
  select(USUBJID, AGE) %>%   # keep only the subject ID and age columns
  filter(AGE >= 65) %>%      # retain subjects aged 65 or older
  arrange(AGE)               # order the remaining rows youngest first

cat("Elderly subjects (using pipe workflow):\n")
print(elderly_summary)

# ===============================
# Part 3: Data Transformation Overview (R4DS Ch. 5)
# ===============================

# Print the overview of the five core dplyr verbs. writeLines() emits each
# element followed by a newline, so the empty strings produce the blank
# lines before the heading and after the list.
writeLines(c(
  "",
  "=== DATA TRANSFORMATION: THE FIVE VERBS ===",
  "1. filter()  - Choose rows based on values",
  "2. select()  - Choose columns by name",
  "3. mutate()  - Create new variables",
  "4. arrange() - Change row order",
  "5. summarise() - Create summary statistics",
  ""
))

# ===============================
# Part 4: filter() - Subset Rows
# ===============================

# For a single step the verb can be called directly with the data as its
# first argument; filter(dm, ...) is the same call as dm %>% filter(...).

# Adults: age 18 or older (every subject in this sample qualifies)
adults <- filter(dm, AGE >= 18)

cat("All subjects (adults only):\n")
print(adults)

# Elderly: age 65 or older
elderly <- filter(dm, AGE >= 65)

cat("\nElderly subjects (age >= 65):\n")
print(elderly)

# Elderly females: conditions separated by commas are combined with AND
elderly_females <- filter(dm, AGE >= 65, SEX == "F")

cat("\nElderly female subjects:\n")
print(elderly_females)

# Subjects enrolled in the USA
usa_subjects <- filter(dm, COUNTRY == "USA")

cat("\nUSA subjects:\n")
print(usa_subjects)

# ===============================
# Part 5: select() - Choose Columns
# ===============================

# Keep three named columns, in the order listed
basic_demo <- select(dm, USUBJID, AGE, SEX)

cat("\nBasic demographics (select specific columns):\n")
print(basic_demo)

# Negative selection: keep everything except RACE and COUNTRY
dm_no_race_country <- select(dm, -c(RACE, COUNTRY))

cat("\nWithout race and country:\n")
print(dm_no_race_country)

# ===============================
# Part 6: mutate() - Create/Modify Variables
# ===============================

# Flag subjects aged 65 or older as "Y", everyone else as "N".
# if_else() is dplyr's stricter counterpart to base ifelse(): it checks that
# both branches have the same type. A missing AGE still yields NA.
dm <- dm %>%
  mutate(ELDERLY = if_else(AGE >= 65, "Y", "N"))

cat("\nWith elderly flag:\n")
print(dm)

# Derive several variables in one mutate() call
dm <- dm %>%
  mutate(
    # Parse the year-month-day character date into a Date column
    RFSTDT = ymd(RFSTDTC),

    # Character age group. case_when() checks the conditions top to bottom
    # and uses the first one that is TRUE; with no fallback branch, a
    # missing AGE gives NA instead of a label.
    AGEGRP = case_when(
      AGE < 40 ~ "Young Adult",
      AGE >= 40 & AGE < 65 ~ "Middle Age",
      AGE >= 65 ~ "Elderly"
    ),

    # Numeric code for the same groups (1 = youngest). The cut points must
    # stay in sync with AGEGRP above.
    AGEGRPN = case_when(
      AGE < 40 ~ 1,
      AGE >= 40 & AGE < 65 ~ 2,
      AGE >= 65 ~ 3
    ),

    # "Y"/"N" flag for female subjects
    FEMALE = if_else(SEX == "F", "Y", "N")
  )

cat("\nWith multiple derived variables:\n")
glimpse(dm)

# ===============================
# Part 7: arrange() - Sort Data
# ===============================

# Youngest to oldest (arrange() sorts ascending by default)
dm_sorted_age <- arrange(dm, AGE)

cat("\nSorted by age (ascending):\n")
print(dm_sorted_age)

# Oldest to youngest: wrap the column in desc() to reverse the order
dm_sorted_age_desc <- arrange(dm, desc(AGE))

cat("\nSorted by age (descending):\n")
print(dm_sorted_age_desc)

# Several keys: later columns break ties left by the earlier ones
dm_sorted_multi <- arrange(dm, COUNTRY, SEX, AGE)

cat("\nSorted by country, then sex, then age:\n")
print(dm_sorted_multi)

# ===============================
# Part 8: summarise() - Create Summary Statistics
# ===============================

cat("\n=== SUMMARISE(): CREATE SUMMARY STATISTICS ===\n")

# Whole-dataset statistics: summarise() collapses all rows into one
age_summary <- summarise(
  dm,
  n_subjects = n(),
  mean_age   = mean(AGE),
  median_age = median(AGE),
  min_age    = min(AGE),
  max_age    = max(AGE),
  sd_age     = sd(AGE)
)

cat("Overall age statistics:\n")
print(age_summary)

# One row per sex (comparable to PROC MEANS with a BY statement).
# Summing or averaging the logical AGE >= 65 counts the TRUE values or gives
# their proportion.
sex_summary <- dm %>%
  group_by(SEX) %>%
  summarise(
    n             = n(),
    mean_age      = round(mean(AGE), 1),
    elderly_count = sum(AGE >= 65),
    elderly_pct   = round(100 * mean(AGE >= 65), 1),
    .groups = "drop"  # return an ungrouped tibble
  )

cat("\nAge statistics by sex:\n")
print(sex_summary)

# One row per country/sex combination
trial_summary <- dm %>%
  group_by(COUNTRY, SEX) %>%
  summarise(
    subjects         = n(),
    mean_age         = round(mean(AGE), 1),
    # range() returns c(min, max); collapse joins them as "min - max"
    age_range        = paste(range(AGE), collapse = " - "),
    elderly_subjects = sum(AGE >= 65),
    .groups = "drop"
  )

cat("\nTrial enrollment summary by country and sex:\n")
print(trial_summary)

# ===============================
# Part 9: Combining Operations with Pipes
# ===============================

# Full pipeline: subset rows, sort, derive a column, then trim the columns.
# NOTE: BMI_CATEGORY is a placeholder computed from AGE (no BMI data exists
# in this demo); it only illustrates case_when() inside a longer pipeline.
processed_dm <- dm %>%
  filter(AGE >= 18) %>%        # adults only; this also drops any missing AGE
  arrange(USUBJID) %>%         # order by subject ID
  mutate(
    # Checked from the oldest band down; the first TRUE condition wins
    BMI_CATEGORY = case_when(
      AGE >= 50 ~ "Mature",
      AGE >= 30 ~ "Middle",
      TRUE ~ "Young"
    )
  ) %>%
  select(USUBJID, AGE, SEX, ELDERLY, AGEGRP, BMI_CATEGORY)

cat("\nProcessed demographics (full pipeline):\n")
print(processed_dm)

# ===============================
# Part 10: SAS vs R Comparison Example
# ===============================

# Print a SAS DATA step next to the dplyr code that derives the same flags.
# The R snippet ends with an explicit AGE >= 65 branch instead of a
# catch-all TRUE branch: a catch-all would label a missing AGE as 'Elderly'.
# The closing note spells out where SAS and R still differ on missing AGE.
writeLines(c(
  "",
  "=== SAS vs R Comparison ===",
  "SAS DATA Step equivalent:",
  "DATA dm;",
  "  SET raw_dm;",
  "  IF AGE >= 65 THEN ELDERLY = 'Y';",
  "  ELSE ELDERLY = 'N';",
  "  ",
  "  IF AGE < 40 THEN AGEGRP = 'Young Adult';",
  "  ELSE IF AGE < 65 THEN AGEGRP = 'Middle Age';",
  "  ELSE AGEGRP = 'Elderly';",
  "RUN;",
  "",
  "R dplyr equivalent:",
  "dm <- raw_dm %>%",
  "  mutate(",
  "    ELDERLY = ifelse(AGE >= 65, 'Y', 'N'),",
  "    AGEGRP = case_when(",
  "      AGE < 40 ~ 'Young Adult',",
  "      AGE < 65 ~ 'Middle Age',",
  "      AGE >= 65 ~ 'Elderly'",
  "    )",
  "  )",
  "",
  "Note: results differ when AGE is missing. SAS treats a missing number as",
  "smaller than any value (ELDERLY = 'N', AGEGRP = 'Young Adult'); the R",
  "code returns NA for both variables."
))

# ===============================
# Part 11: Data Import Examples (Clinical Files)
# ===============================

# Print example import calls as text. Nothing is read from disk here; the
# snippets only show what the calls look like. Empty strings produce the
# blank lines between examples.
writeLines(c(
  "",
  "=== CLINICAL DATA IMPORT EXAMPLES ===",
  "Note: These examples show how you would import real clinical data files",
  "",
  # Example 1: delimited text exported from an EDC system
  "# Import CSV file from EDC:",
  "dm_csv <- read_csv('data/demographics.csv', ",
  "                   col_types = cols(",
  "                     USUBJID = col_character(),",
  "                     AGE = col_double(),",
  "                     SEX = col_character()",
  "                   ))",
  "",
  # Example 2: laboratory results kept in an Excel workbook
  "# Import Excel lab data:",
  "lab_data <- read_excel('data/lab_results.xlsx', ",
  "                       sheet = 'Chemistry',",
  "                       skip = 2)  # Skip header rows",
  "",
  # Example 3: SAS datasets and transport files
  "# Import SAS datasets:",
  "dm_sas <- read_sas('legacy_data/dm.sas7bdat')",
  "dm_xpt <- read_xpt('submission/dm.xpt')  # Regulatory submission",
  ""
))

# ===============================
# Part 12: GitHub Copilot Practice
# ===============================

cat("=== GitHub Copilot in RStudio Practice ===\n")
cat("Try writing these comments and see what Copilot suggests in RStudio:\n\n")

# Practice prompts: type each comment in RStudio and review the suggestion.
# The first two are intentionally left without code.

# Create a treatment assignment flag for subjects over 50


# Calculate days since reference start date


# Create a flag for subjects from North America (USA or CAN)
# Worked example for the prompt above. %in% never returns NA, so a missing
# COUNTRY is flagged "N". if_else() is dplyr's type-strict counterpart to
# base ifelse().
dm <- dm %>%
  mutate(NORTH_AMERICA = if_else(COUNTRY %in% c("USA", "CAN"), "Y", "N"))

cat("\nFinal dataset with North America flag:\n")
print(dm)

# ----------------------------
# Module 2 Demo Complete!
# ----------------------------

# Closing recap of the topics covered by this script. summarise() is listed
# with the other four verbs because the script practises it in its own
# section.
cat("\n🎉 Module 2 Demo Complete!\n")
cat("You've practiced:\n")
cat("- filter() for subsetting rows\n")
cat("- select() for choosing columns\n")
cat("- mutate() for creating variables\n")
cat("- arrange() for sorting data\n")
cat("- summarise() for creating summary statistics\n")
cat("- Combining operations with pipes\n")
cat("- SAS vs R comparisons\n")
cat("- GitHub Copilot in RStudio assistance\n")
cat("\nReady for more advanced data wrangling in Module 3!\n")
